#!/usr/bin/env python
#coding=utf-8
"""

从字符串中提取日期，格式化日期

author:rocky
email:wu.zheng@socialcredits.cn

"""

import re
import copy
import datetime
import warnings
DATE_FORMAT= '%Y-%m-%d %H:%M:%S'         # strptime pattern for a full "date time" timestamp

# Positional Chinese digits; the index of each character is its numeric value.
_CHINESE_DIGITS = u'〇一二三四五六七八九'


def _chinese_number_to_int(numeral):
    """Convert a short Chinese numeral (0-99 or positional digits) to an int.

    Two spellings are understood:
      * positional digits, e.g. u'二〇一五' -> 2015, u'三一' -> 31
      * the tens form built around u'十', e.g. u'十' -> 10, u'十一' -> 11,
        u'二十' -> 20, u'二十六' -> 26

    Raises ValueError for anything else (for instance u'十' inside a
    four-character year, or more than one digit on either side of u'十').
    """
    if u'十' not in numeral:
        # int('') and an unknown character both raise ValueError here.
        return int(''.join(str(_CHINESE_DIGITS.index(ch)) for ch in numeral))
    tens, _, ones = numeral.partition(u'十')
    if len(tens) > 1 or len(ones) > 1:
        raise ValueError('unsupported Chinese numeral: %r' % (numeral,))
    # A bare leading u'十' means "one ten"; a missing unit digit means zero.
    tens_value = _CHINESE_DIGITS.index(tens) if tens else 1
    ones_value = _CHINESE_DIGITS.index(ones) if ones else 0
    return tens_value * 10 + ones_value


def extract_chinese_date(text,is_str=False):
    """Extract the first date written in Chinese numerals from ``text``.

    Recognises dates such as u'二〇一五年七月三十日'. The characters u'零'
    and u'○' are accepted as alternative spellings of u'〇'. The time part
    of the result is always midnight.

    Args:
        text: unicode text, or a UTF-8 encoded byte string.
        is_str: when true, return the date formatted as
            'YYYY-MM-DD HH:MM:SS' instead of a datetime object.

    Returns:
        A ``datetime.datetime`` (or its string form when ``is_str`` is
        true). An empty string ``''`` is returned, after a warning, when
        ``text`` is empty, when no date pattern is found, or when the
        matched numerals do not form a valid calendar date.

    Raises:
        Exception: if ``text`` is non-empty but is not a string.

    Example:
        test_str = u'我是测试字符串网二〇一五年七月三十日中国共产'
        extract_chinese_date(test_str, is_str=True)  # '2015-07-30 00:00:00'
    """
    if not text :
        warnings.warn("can't find  date because input text is null")
        return ''
    # ``bytes`` is ``str`` on Python 2, and ``type(u'')`` is the text type on
    # both major versions, so this works without ``basestring``/``unicode``.
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    if not isinstance(text, type(u'')):
        raise Exception(u'text is need str or unicode but %s'%type(text))

    # Normalise the alternative ways of writing zero.
    text = text.replace(u'零', u'〇').replace(u'○', u'〇')
    match = re.search(
        u'([〇一二三四五六七八九十]{4})年'
        u'([〇一二三四五六七八九十]+)月'
        u'([〇一二三四五六七八九十]+)日',
        text)
    if not match:
        warnings.warn("failed find date pattern and  return None")
        return ''

    try:
        year, month, day = [_chinese_number_to_int(part)
                            for part in match.groups()]
        # The constructor validates the calendar date (month 13, Feb 30, ...).
        date_result = datetime.datetime(year, month, day)
    except ValueError:
        warnings.warn("matched text is not a valid date and return ''")
        return ''

    if is_str:
        # isoformat(' ') yields 'YYYY-MM-DD HH:MM:SS' here and, unlike
        # strftime, also works for years before 1900 on Python 2.
        return date_result.isoformat(' ')
    return date_result


def format_date_str(text,isall=0):
    """Parse the first 'YYYY-MM-DD HH:MM:SS'-style date found in ``text``.

    Month, day and every time field may be one or two digits, and the time
    part may be partly or completely missing: both '2015-1-1' and
    '2015-01-01 3:20' are accepted. Missing fields are filled with zeros and
    single digits are zero-padded before parsing.

    Args:
        text: string to search.
        isall: unused; kept so existing callers keep working.

    Returns:
        A ``datetime.datetime``, or None (after a warning) when ``text`` is
        empty, contains no date pattern, or the matched fields do not form
        a valid date/time.
    """
    if not text:
        warnings.warn('failed find date pattern and return None')
        return None
    date_match = re.search(
        r'((?P<year>\d{4})-(?P<month>\d{0,2})-(?P<day>\d{0,2})) ?'
        r'((?P<hour>\d{0,2})?:?(?P<minute>\d{0,2})?:?(?P<second>\d{0,2})?)?',
        text)
    if not date_match:
        warnings.warn('failed find date pattern and return None')
        return None

    # Empty or absent fields become '00'; single digits get a leading zero.
    fields = dict((name, (value or '').zfill(2))
                  for name, value in date_match.groupdict().items())
    result_str = ('%(year)s-%(month)s-%(day)s '
                  '%(hour)s:%(minute)s:%(second)s' % fields)
    try:
        return datetime.datetime.strptime(result_str, '%Y-%m-%d %H:%M:%S')
    except ValueError:
        # The pattern matched but the values are out of range
        # (e.g. month 13, or a month/day that was left empty).
        warnings.warn('matched text is not a valid date and return None')
        return None


def extract_first_date(text,is_str=False):
    """Extract the first date (and optional time) found in ``text``.

    Common separators are normalised before parsing: u'年', u'月', '/' and
    '.' become '-', u'日' and u'号' become a space, u'时', u'点' and u'分'
    become ':' and u'秒' is dropped. An input that consists of exactly eight
    digits (ignoring surrounding whitespace) is read as YYYYMMDD. The
    normalised text is then handed to ``format_date_str``.

    Args:
        text: unicode text, or a UTF-8 encoded byte string.
        is_str: when true, return the date formatted as
            'YYYY-MM-DD HH:MM:SS' instead of a datetime object.

    Returns:
        ``''`` (after a warning) when ``text`` is empty. Otherwise the
        result of ``format_date_str``: a ``datetime.datetime`` or None when
        ``is_str`` is false, or the formatted string (``''`` if nothing was
        parsed) when ``is_str`` is true.

    Raises:
        Exception: if ``text`` is non-empty but is not a string.
    """
    if not text :
        warnings.warn("can't find date because input text is null")
        return ''
    # ``bytes`` is ``str`` on Python 2, and ``type(u'')`` is the text type on
    # both major versions, so this works without ``basestring``/``unicode``.
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    if not isinstance(text, type(u'')):
        raise Exception(u'text is need str or unicode but %s'%type(text))

    text = re.sub(u'[年月/.]', '-', text)
    text = re.sub(u'[日号]', ' ', text)
    text = re.sub(u'[时点分]', ':', text)
    text = re.sub(u'秒', '', text)

    # A bare YYYYMMDD string has no separators to anchor on; split it by hand.
    compact = text.strip()
    if re.match(r'\d{8}\Z', compact):
        text = compact[:4] + "-" + compact[4:6] + "-" + compact[6:]

    result_date = format_date_str(text)

    if is_str:
        return result_date.strftime('%Y-%m-%d %H:%M:%S') if result_date else ''
    return result_date

if __name__ == '__main__':
    # Manual smoke test: print the date parsed from a sample sentence.
    # print(...) with a single argument behaves the same on Python 2 and 3.
    result = extract_chinese_date(u"二○一七年三月六日下午十五点四十五分在本院第三十一审判庭依法审理本案")
    print(result)